Imports¶

In [119]:
#!pip install pybaseball

from pybaseball import statcast, statcast_batter, playerid_lookup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import unidecode
from datetime import datetime

from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder

from scipy.stats import multivariate_normal
from scipy.spatial.distance import mahalanobis


from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.mixture import GaussianMixture
from sklearn.metrics import pairwise_distances_argmin_min

import warnings
warnings.filterwarnings('ignore')
In [57]:
# The 2021 and 2022 seasons are read from local CSV files; the commented-out
# statcast() calls below are the direct-download alternative.
# data2021 = statcast('2021-04-01', '2021-10-03', parallel = True)
# data2022 = statcast('2022-03-31', '2022-10-02', parallel = True)
# data2023 = statcast('2023-03-30', '2023-10-01', parallel = True)

SEASON_CSVS = {2021: 'statcast21.csv', 2022: 'statcast22.csv'}
data2021 = pd.read_csv(SEASON_CSVS[2021])
data2022 = pd.read_csv(SEASON_CSVS[2022])

Cleaning¶

Eliminating Position Players¶

In [ ]:
# Number of rows (pitches) per player_name in 2021, largest first;
# the index of that Series is the matching list of names.
pitch_totals = data2021['player_name'].value_counts(sort=True)
sorted_names = pitch_totals.index

#for i,j in zip(sorted_names,pitch_totals):
#    if j <= 150:
#        print(i,j)
In [ ]:
# Position players to leave out of the dataset. The names are stored as one
# comma-separated run of "Last, First" entries, so splitting on ',' yields
# alternating last-name / " First" tokens.
names = (
    "Blandino, Alex,León, Sandy,Sogard, Eric,Astudillo, Willians,Peterson, Jace,Torreyes, Ronald,Mercedes, Yermín,"
    "Castro, Harold,Peralta, David,Almora Jr., Albert,Mejía, Francisco,Drury, Brandon,Tauchman, Mike,Burns, Andy,"
    "Phillips, Brett,Eaton, Adam,Bemboom, Anthony,Ruf, Darin,Maile, Luke,Tom, Ka'ai,Luplow, Jordan,Plawecki, Kevin,"
    "Rogers, Jake,Reddick, Josh,Moreland, Mitch,Cabrera, Asdrúbal,Guillorme, Luis,Holaday, Bryan,Pérez, Hernán,"
    "Wilkerson, Stevie,Arroyo, Christian,Rizzo, Anthony,Romine, Andrew,Mendick, Danny,Schrock, Max,"
    "Culberson, Charlie,Araúz, Jonathan,Gonzalez, Marwin,Turner, Justin,Robertson, Daniel,Holt, Brock,"
    "Mathisen, Wyatt,Wynns, Austin,Lopez, Nicky,Moran, Brian,Owings, Chris,Reyes, Pablo,Difo, Wilmer,"
    "Fuentes, Joshua,Valaika, Pat,Cronenworth, Jake,Pillar, Kevin,Slater, Austin,Evans, Phillip,Duffy, Matt,"
    "Alcántara, Sergio,Knapp, Andrew,Freeman, Mike,Gonzalez, Romy,Maton, Nick,Mayfield, Jack,Alberto, Hanser,"
    "Clemens, Kody,González, Luis,Gordon, Nick,VanMeter, Josh,Bethancourt, Christian,Brosseau, Mike,Myers, Wil,"
    "Clement, Ernie,Schwindel, Frank,Molina, Yadier,Taylor, Michael A.,Neuse, Sheldon,Strange-Gordon, Dee,"
    "McKenna, Ryan,Bradley Jr., Jackie,Pujols, Albert,Freeman, Caleb,Pinder, Chad,Chang, Yu,Batten, Matthew,"
    "Simmons, Andrelton,Escobar, Alcides,Dozier, Hunter,Walton, Donovan,Reynolds, Matt,Torrens, Luis,Dickerson, Corey,"
    "Barnhart, Tucker,Caratini, Victor,Merrifield, Whit,Adrianza, Ehire,Knizner, Andrew,Grichuk, Randal,Serven, Brian,"
    "Palacios, Jermaine,McGuire, Reese,Vargas, Ildemaro,Hall, Darick,Gosselin, Phil,Nogowski, John,Stubbs, Garrett,"
    "Sánchez, Yolmer,Arcia, Orlando,Ford, Mike,Hager, Jake,Harrison, Josh,Kelly, Carson,Lopez, Alejo,"
    "Carpenter, Matt,Reyes, Franmil,García, Robel"
).split(',')

# Re-pair consecutive tokens into "Last, First", the format of the Statcast name column
pos_players = [f"{last},{first}" for last, first in zip(names[::2], names[1::2])]
In [ ]:
def _velo_summary(pitches):
    '''
    Description: Per-player velocity summary used to spot position players pitching
    --------------------------------------------------------------------------------
    Inputs: pitches (dataframe with 'player_name' and 'release_speed' columns)

    Returns: dataframe indexed by player_name with columns
        max_velo, med_velo, avg_velo, num_pitches, 'Max-Avg Diff'
    '''
    # String aggregation names avoid pandas' deprecation of passing NumPy
    # callables (np.max, np.mean, ...) to agg; 'size' counts rows per player.
    summary = pitches.groupby('player_name').agg(
        max_velo=('release_speed', 'max'),
        med_velo=('release_speed', 'median'),
        avg_velo=('release_speed', 'mean'),
        num_pitches=('player_name', 'size'),
    )
    # Gap between a player's hardest pitch and their average pitch
    summary['Max-Avg Diff'] = summary['max_velo'] - summary['avg_velo']
    return summary


# Pitch counts and velocity numbers for every player, one frame per season
data21_grps = _velo_summary(data2021)
data22_grps = _velo_summary(data2022)
In [ ]:
# Filter out to find all position players

#data21_grps[((data21_grps.num_pitches <= 200) & (data21_grps['med_velo'] < 85)) |
#            ((data21_grps.num_pitches <= 200) & (data21_grps['Max-Avg Diff'] > 12))|
#           ((data21_grps.num_pitches <= 200) & (data21_grps['max_velo'] < 90))]

#data22_grps[((data22_grps.num_pitches <= 200) & (data22_grps['med_velo'] < 85)) |
#            ((data22_grps.num_pitches <= 200) & (data22_grps['Max-Avg Diff'] > 12))|
#           ((data22_grps.num_pitches <= 200) & (data22_grps['max_velo'] < 90))]
In [ ]:
#(Recursively) Eliminate position players from dataset, update dataset

#data21 = data2021[~data2021.player_name.isin(pos_players)]
#data22 = data2022[~data2022.player_name.isin(pos_players)]

#data21.to_csv('statcast21.csv',index=False)
#data22.to_csv('statcast22.csv',index=False)

NA Values¶

In [58]:
# Define a function to fill NaN values within groups
def fillna_by_pitcher(df, cols):
    '''
    Description: Fills NA values in the given columns with that column's mean
    over the rows of df (intended to be applied per pitcher / pitch type group)
    --------------------------------------------------------------------------------
    Inputs: df (dataframe), cols (iterable of column names to fill)
    
    Returns: df
        Copy of the input with NA values in cols filled; the input is not modified
    '''
    
    # Work on a copy so the caller's frame (or groupby slice) is never mutated
    filled = df.copy()
    
    # Assign the filled column back explicitly: `df[col].fillna(..., inplace=True)`
    # operates on a temporary and is a no-op under pandas copy-on-write.
    for col in cols:
        filled[col] = filled[col].fillna(filled[col].mean())
    
    return filled
In [59]:
def clean_train_data(df):
    '''
    Description: Cleans training data: keeps only the relevant feature columns,
    removes non-pitches and zero-movement rows, fills NA pitch metrics with the
    mean for that pitcher and pitch type, drops rows still missing required
    values, and encodes base occupancy as 0/1
    --------------------------------------------------------------------------------
    Inputs: df (raw Statcast dataframe)
    
    Returns: df_clean
        Cleaned copy of the input, sorted chronologically; the input is not modified
    '''
    # Pitch type codes that are removed as non-pitches
    non_pitches = ['FA','PO']
    
    y = ['delta_run_exp']
    
    context_features = ['player_name','p_throws','batter','stand','pitch_type','pitch_number',
            'home_team','game_date','game_pk','at_bat_number',
            'balls','strikes', 'outs_when_up','on_3b', 'on_2b', 'on_1b']
    
    cont_features = ['release_speed','release_extension','effective_speed','release_spin_rate',
            'release_pos_x', 'release_pos_y', 'release_pos_z','spin_axis', 'pfx_x', 'pfx_z',
            'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot']
    
    features = y + context_features + cont_features
    
    # Columns filled with the group mean, and columns that must be present afterwards
    fill_cols = ['release_speed','release_extension','effective_speed','release_spin_rate',
                 'release_pos_x','release_pos_y','release_pos_z','spin_axis']

    na_cols = ['delta_run_exp','pitch_type','pitch_number','pfx_x','pfx_z',
               'release_pos_x', 'release_pos_y', 'release_pos_z', 
               'release_speed','release_extension','effective_speed','release_spin_rate',
               'spin_axis','sz_top', 'sz_bot']
    
    group_keys = ['player_name','pitch_type']
    base_cols = ['on_1b','on_2b','on_3b']
    
    # Remove pitchouts/non-pitches and rows where either movement component is exactly 0
    # (treated as Statcast errors); keep only the feature columns
    is_pitch = ~df['pitch_type'].isin(non_pitches)
    has_movement = (df['pfx_x'] != 0.0) & (df['pfx_z'] != 0.0)
    
    # Rows without a pitcher or pitch type cannot be assigned to a group, so they are
    # dropped here (groupby would silently discard them anyway). The copy keeps the
    # assignments below from touching the caller's frame.
    df_filt = df.loc[is_pitch & has_movement, features].dropna(subset=group_keys).copy()
    
    # Fill NA metrics with the mean for that pitcher + pitch type. transform() keeps
    # the original index, unlike groupby.apply, which on pandas >= 2.0 prepends the
    # group keys to the index and makes 'player_name' ambiguous in later groupbys.
    group_means = df_filt.groupby(group_keys)[fill_cols].transform('mean')
    df_filt[fill_cols] = df_filt[fill_cols].fillna(group_means)
    
    # Drop rows still missing any required value
    df_clean = df_filt.dropna(subset=na_cols).copy()
    
    # Base columns hold a value when a runner is on, NA otherwise -> 1 / 0
    df_clean[base_cols] = df_clean[base_cols].notna().astype(int)
    
    # Sort dataframe by pitches in chronological order, return
    df_clean = df_clean.sort_values(['game_date','game_pk','at_bat_number','pitch_number'])
    return df_clean
In [60]:
def add_new_features(df, season_start):
    '''
    Description: Adds new features in dataframe
        - inferred_axis: Inferred Spin Axis (SSW Effects)
        - axis_diff: Difference of Observed and Inferred Spin Axis
        - pitch_count: Pitch # of outing for each outing per pitcher 
        - game_week: Week of season for game_date, counted from season_start (week 1)
    --------------------------------------------------------------------------------
    Inputs: df, season_start (str, format 'YYYY-MM-DD')
    
    Returns: df
        Copy of the input with the new features added; the input is not modified
    
    Raises: ValueError if season_start is not in 'YYYY-MM-DD' format
    '''
    
    # Work on a copy so no columns are added to the caller's frame
    df = df.copy()
    
    # inferred_axis: 180 / pi * atan(pfx_z / pfx_x) + 90 (where pfx_x is < 0, add 180 degrees.)
    df['inferred_axis'] = np.degrees(np.arctan(df['pfx_z'] / df['pfx_x'])) + 90
    df.loc[df['pfx_x'] < 0, 'inferred_axis'] += 180
    
    # axis_diff: spin_axis - inferred_axis
    df['axis_diff'] = df['spin_axis'] - df['inferred_axis']
    
    # Pitch Count: cumulative pitch number of outing for pitcher. cumcount() runs on the
    # chronologically sorted frame; the assignment realigns it to df on the index.
    chronological = df.sort_values(['game_date','game_pk','at_bat_number','pitch_number'])
    df['pitch_count'] = chronological.groupby(
        ['game_date','game_pk','player_name']).cumcount() + 1
    
    # game_week: whole weeks elapsed since the season start, 1-based. Computed on the
    # whole column at once instead of a row-by-row apply.
    start_date = datetime.strptime(season_start, '%Y-%m-%d')
    game_days = pd.to_datetime(df['game_date']).dt.normalize()
    df['game_week'] = (game_days - pd.Timestamp(start_date)).dt.days // 7 + 1
    
    return df
In [61]:
# Clean each raw season independently
data21 = clean_train_data(df=data2021)
data22 = clean_train_data(df=data2022)
In [62]:
# Add engineered features; the date passed is used as day 0 of game_week for that season
data21 = add_new_features(df=data21, season_start='2021-04-01')
data22 = add_new_features(df=data22, season_start='2022-03-31')

EDA¶

Feature Exploration- Pitch Type Frequency¶

Aggregate Counts, Cumulative Relative Frequency of Pitch Type Over Season¶

In [63]:
def plot_lineplot(dataset, xvar, yvar, hue_var):
    '''
    Description: Plots lineplot, adds labels of hue variable at end of each line
    --------------------------------------------------------------------------------
    Inputs: dataset (dataframe), xvar, yvar, hue_var (column names)
    
    Returns: None
    '''
    plt.figure(figsize = (20,10))
    
    # Fix the level -> colour mapping up front (tab10 colours, assigned in order of
    # first appearance) so each label can be drawn in its line's colour without
    # inspecting the drawn artists.
    levels = list(pd.unique(dataset[hue_var].dropna()))
    palette = dict(zip(levels, sns.color_palette('tab10', n_colors = len(levels))))
    sns.lineplot(data = dataset, x = xvar, y = yvar, hue = hue_var, palette = palette or 'tab10')
    
    # Label each line at its last x-value. The end point is taken from the data per
    # hue level rather than by searching the frame for a matching y value: that search
    # mislabels lines that end on the same value and fails when the plotted y is an
    # aggregate that matches no single row.
    valid = dataset.dropna(subset = [xvar, yvar, hue_var])
    for level in levels:
        level_rows = valid[valid[hue_var] == level]
        if level_rows.empty:
            continue
        last_x = level_rows[xvar].max()
        # Mean over rows sharing the last x, matching lineplot's default aggregation
        last_y = level_rows.loc[level_rows[xvar] == last_x, yvar].mean()
        plt.text(last_x, last_y, level, ha='right', va='bottom', fontsize=12, color=palette[level])
In [64]:
def get_aggregate_count(data, player, year, plot = True):
    '''
    Description: Counts pitches per pitch_type per game_date (0 where a pitch type
    was not thrown on a date), adds the running total per pitch_type, and optionally
    plots the running totals as a labelled lineplot
    --------------------------------------------------------------------------------
    Inputs: data (dataframe), player (str), year (str/int), plot (boolean, default True)
        player and year are only used in the plot title
    
    Returns: pitch_aggs (dataframe)
        Columns: pitch_type, game_date, count, Cumulative_Count
    '''
    # Parse game_date on a new frame so the caller's data is left untouched
    dated = data.assign(game_date = pd.to_datetime(data['game_date']))
    
    # pitch_type x game_date table of counts, then back to long format
    # (rows come out ordered by game_date, then pitch_type)
    per_day = dated.groupby(['pitch_type','game_date']).size()
    count_table = per_day.unstack(fill_value=0)
    pitch_aggs = count_table.reset_index().melt(
        id_vars='pitch_type', var_name='game_date', value_name='count')
    
    # Running total of each pitch type over the dates
    pitch_aggs['Cumulative_Count'] = pitch_aggs.groupby('pitch_type')['count'].cumsum()
    
    if not plot:
        return pitch_aggs
    
    plot_lineplot(pitch_aggs, 'game_date', 'Cumulative_Count', 'pitch_type')
    plt.xticks(rotation=45)
    plt.xlabel('Date')
    plt.ylabel('Aggregate Count')
    plt.title(f'Aggregate Counts by Pitch Type by Game Date, {player}, {year}', fontsize=16)
    plt.legend(title='Pitch Type',loc="upper left")
    plt.show()
    
    return pitch_aggs
In [65]:
def get_cumulative_freq(agg_data, player, year, plot = True):
    '''
    Description: From per-date pitch type counts, computes each pitch type's share of
    all pitches thrown up to and including each game_date, and optionally plots it
    --------------------------------------------------------------------------------
    Inputs: agg_data (dataframe with game_date, count and Cumulative_Count columns),
        player (str), year (str/int), plot (boolean, default True)
        player and year are only used in the plot title
    
    Returns: merged_freq_df (dataframe)
        agg_data plus season_total and Cumulative_Rel_Freq columns
    '''
    
    # Total pitches (all types) per date, accumulated over the dates
    daily_totals = agg_data.groupby('game_date')['count'].sum()
    season_totals = daily_totals.cumsum().reset_index(name = 'season_total')
    
    # Attach the running season total to every row, then divide the running
    # per-type count by it
    merged_freq_df = pd.merge(agg_data, season_totals, on = 'game_date', how = 'left')
    merged_freq_df['Cumulative_Rel_Freq'] = merged_freq_df['Cumulative_Count'].div(
        merged_freq_df['season_total'])

    if not plot:
        return merged_freq_df
    
    plot_lineplot(merged_freq_df, 'game_date', 'Cumulative_Rel_Freq','pitch_type')
    plt.xticks(rotation=45)
    plt.xlabel('Date')
    plt.ylabel('Cumulative Relative Frequency')
    plt.title(f'Cumulative Relative Frequency of Pitch Type, {player}, {year}', fontsize=16)
    plt.legend(title='Pitch Type',loc="upper left")
    plt.show()
    
    return merged_freq_df
In [66]:
# League-wide cumulative pitch type counts, 2021
all21_aggregates = get_aggregate_count(data=data21, player='All', year=2021)
In [67]:
# League-wide cumulative relative frequency of each pitch type, 2021
all21_rel_freq = get_cumulative_freq(agg_data=all21_aggregates, player='All', year=2021)
In [68]:
# League-wide cumulative pitch type counts, 2022
all22_aggregates = get_aggregate_count(data=data22, player='All', year=2022)
In [69]:
# League-wide cumulative relative frequency of each pitch type, 2022
all22_rel_freq = get_cumulative_freq(agg_data=all22_aggregates, player='All', year=2022)

Frequency by Handedness of Batter¶

In [70]:
# Pitches thrown to right-handed batters: aggregate silently, plot only the cumulative frequencies
data21_rhb = data21.loc[data21['stand'] == 'R']
data21_rhb_agg = get_aggregate_count(data21_rhb, player='All RHB', year=2021, plot=False)
data21_rhb_cumul = get_cumulative_freq(data21_rhb_agg, player='All RHB', year=2021)
In [71]:
# Pitches thrown to left-handed batters: aggregate silently, plot only the cumulative frequencies
data21_lhb = data21.loc[data21['stand'] == 'L']
data21_lhb_agg = get_aggregate_count(data21_lhb, player='All LHB', year=2021, plot=False)
data21_lhb_cumul = get_cumulative_freq(data21_lhb_agg, player='All LHB', year=2021)

Relative Frequency per Day¶

In [72]:
def get_relative_freq(data, player, year, plot = True):
    '''
    Description: Computes, for every game_date, each pitch type's share of the pitches
    thrown on that date (0 where a pitch type was not thrown), and optionally plots it
    --------------------------------------------------------------------------------
    Inputs: data (dataframe), player (str), year (str/int), plot (boolean, default True)
        player and year are only used in the plot title
    
    Returns: merged_df (dataframe)
        Columns: pitch_type, game_date, count, total, Relative_Freq
    '''
    
    # Parse game_date on a new frame so the caller's data is left untouched
    dated = data.assign(game_date = pd.to_datetime(data['game_date']))

    # pitch_type x game_date table of counts, back to long format,
    # ordered by pitch_type then game_date
    count_table = dated.groupby(['pitch_type','game_date']).size().unstack(fill_value=0)
    long_counts = count_table.reset_index().melt(
        id_vars='pitch_type', var_name='game_date', value_name='count')
    long_counts = long_counts.sort_values(by=['pitch_type', 'game_date']).reset_index(drop=True)
    
    # Pitches with a recorded pitch_type on each date
    per_date = dated.groupby('game_date')['pitch_type'].count().reset_index(name='total')
    
    # Share of the day's pitches for each pitch type
    merged_df = pd.merge(long_counts, per_date, on='game_date', how='left')
    merged_df['Relative_Freq'] = merged_df['count'].div(merged_df['total'])
    
    if not plot:
        return merged_df
    
    plot_lineplot(merged_df, 'game_date','Relative_Freq','pitch_type')
    plt.xticks(rotation=45)
    plt.xlabel('Date')
    plt.ylabel('Relative Frequency')
    plt.title(f'Relative Pitch Frequencies by Pitch Type per Game Date, {player}, {year}', fontsize=16)
    plt.legend(title='Pitch Type',loc="upper left")
    plt.show()
    
    return merged_df
In [73]:
# League-wide per-date relative pitch type frequency, 2021
all21_relative = get_relative_freq(data=data21, player='All', year='2021')
In [74]:
# League-wide per-date relative pitch type frequency, 2022
all22_relative = get_relative_freq(data=data22, player='All', year='2022')

Individual Case¶

Shohei Ohtani¶

In [75]:
# Independent, row-reversed copy of the 2021 data with a fresh 0..n-1 index
test_data = data21.copy().iloc[::-1].reset_index(drop=True)
In [76]:
# All 2021 pitches thrown by Shohei Ohtani
ohtani = test_data.loc[test_data['player_name'] == 'Ohtani, Shohei']
In [77]:
# Ohtani: cumulative pitch type counts
ohtani_agg21 = get_aggregate_count(data=ohtani, player='Shohei Ohtani', year='2021')
In [78]:
# Ohtani: cumulative relative frequency of each pitch type
ohtani21_cumul_freq = get_cumulative_freq(agg_data=ohtani_agg21, player='Shohei Ohtani', year=2021)
In [79]:
# Ohtani: per-date relative frequency of each pitch type
ohtani_rel21 = get_relative_freq(data=ohtani, player='Shohei Ohtani', year='2021')

Zack Wheeler¶

In [80]:
# All 2021 pitches thrown by Zack Wheeler
wheeler = test_data.loc[test_data['player_name'] == 'Wheeler, Zack']
In [81]:
# Wheeler: aggregate silently, plot only the cumulative relative frequencies
wheeler_agg21 = get_aggregate_count(wheeler, player='Zack Wheeler', year='2021', plot=False)
wheeler_cumul_freq21 = get_cumulative_freq(wheeler_agg21, player='Zack Wheeler', year=2021)
In [82]:
# Wheeler: per-date relative frequency of each pitch type
# (year passed without the stray leading space that put a double space in the plot title)
wheeler_rel21 = get_relative_freq(wheeler, 'Zack Wheeler','2021')
In [83]:
# Split Wheeler's pitches by batter handedness
wheeler_lhb21 = wheeler.loc[wheeler['stand'] == 'L']
wheeler_rhb21 = wheeler.loc[wheeler['stand'] == 'R']

# Versus right-handed batters: aggregate silently, plot the cumulative relative frequencies
wheeler_rhb_agg21 = get_aggregate_count(wheeler_rhb21, player='Zack Wheeler (RHB)', year='2021', plot=False)
wheeler_rhb21_cumul = get_cumulative_freq(wheeler_rhb_agg21, player='Zack Wheeler (RHB)', year=2021)
In [84]:
# Versus left-handed batters: aggregate silently, plot the cumulative relative frequencies
wheeler_lhb_agg21 = get_aggregate_count(wheeler_lhb21, player='Zack Wheeler (LHB)', year='2021', plot=False)
wheeler_lhb21_cumul = get_cumulative_freq(wheeler_lhb_agg21, player='Zack Wheeler (LHB)', year=2021)

Cumulative Relative Frequency: Pitcher vs. Batter (per AB), Pitcher vs. Batter (per Game)¶

In [85]:
# Sort chronologically once; both running counts below are taken over this ordering
_chrono_ab = data21.sort_values(['game_date','game_pk','at_bat_number','pitch_number'])

# Running pitch number within each at-bat (1-based)
pitch_counts_ab = _chrono_ab.groupby(
    ['game_date','game_pk','player_name','at_bat_number']).cumcount() + 1

# Running count of the same pitch type within each at-bat (1-based)
pitch_type_counts_ab = _chrono_ab.groupby(
    ['game_date','game_pk','player_name','at_bat_number','pitch_type']).cumcount() + 1

del _chrono_ab
In [86]:
# Histogram of (running count of this pitch type) / (running pitch count) within each at-bat
sns.histplot(pitch_type_counts_ab.div(pitch_counts_ab)).set_title(
    'Cumulative Relative Frequency of Pitches per AB per Game')
Out[86]:
Text(0.5, 1.0, 'Cumulative Relative Frequency of Pitches per AB per Game')
In [87]:
# Sort chronologically once; both running counts below are taken over this ordering
_chrono_matchup = data21.sort_values(['game_date','game_pk','at_bat_number','pitch_number'])

# Running pitch number within each pitcher vs. batter matchup per game (1-based)
pitch_counts_hitvpitch = _chrono_matchup.groupby(
    ['game_date','game_pk','player_name','batter']).cumcount() + 1

# Running count of the same pitch type within each pitcher vs. batter matchup per game (1-based)
pitch_type_counts_hitvpitch = _chrono_matchup.groupby(
    ['game_date','game_pk','player_name','batter','pitch_type']).cumcount() + 1

del _chrono_matchup
In [88]:
# Histogram of (running count of this pitch type) / (running pitch count)
# within each batter v. pitcher matchup per game
sns.histplot(pitch_type_counts_hitvpitch.div(pitch_counts_hitvpitch)).set_title(
    'Cumulative Relative Frequency of Pitches per Hitter v. Pitcher Matchup per Game')
Out[88]:
Text(0.5, 1.0, 'Cumulative Relative Frequency of Pitches per Hitter v. Pitcher Matchup per Game')

Cumulative Relative Frequency per Outing - 2021¶

In [89]:
# Sort chronologically once; both running counts below are taken over this ordering
_chrono_outing = data21.sort_values(['game_date','game_pk','at_bat_number','pitch_number'])

# Running pitch number within each pitcher outing (start or relief), 1-based
pitch_counts = _chrono_outing.groupby(
    ['game_date','game_pk','player_name']).cumcount() + 1

# Running count of the same pitch type within each pitcher outing, 1-based
pitch_type_counts = _chrono_outing.groupby(
    ['game_date','game_pk','player_name','pitch_type']).cumcount() + 1

del _chrono_outing
In [90]:
# Histogram of (running count of this pitch type) / (running pitch count) within each outing
sns.histplot(pitch_type_counts.div(pitch_counts)).set_title(
    'Cumulative Relative Frequency of Pitches per Outing')
Out[90]:
Text(0.5, 1.0, 'Cumulative Relative Frequency of Pitches per Outing')

Feature Exploration: Velocity, Movement Differentials¶

Added after pitch classifications from clustering analysis¶

In [91]:
def _primary_pitch(pitch_types):
    '''
    Description: Most frequent fastball type (FC, SI or FF) in the given pitch types,
    or the most frequent pitch type overall when no fastball was thrown
    --------------------------------------------------------------------------------
    Inputs: pitch_types (Series of pitch type codes)

    Returns: pitch type code
    '''
    fastballs = pitch_types[pitch_types.isin(['FC','SI','FF'])]
    candidates = fastballs if len(fastballs) > 0 else pitch_types
    return candidates.value_counts().idxmax()


# One row per pitcher outing (game_date, game_pk, player_name) holding that outing's primary pitch
primary_fb = (
    data21.groupby(['game_date','game_pk','player_name'])
    .agg({'pitch_type': _primary_pitch})
    .rename(columns={'pitch_type':'primary_pitch'})
)
In [113]:
# Keep only the pitches whose pitch_type equals their outing's primary pitch
# (inner join on outing keys plus pitch_type == primary_pitch)
primary_fb_data = pd.merge(
    data21,
    primary_fb.reset_index(),
    how='inner',
    left_on=['game_date','game_pk','player_name','pitch_type'],
    right_on=['game_date','game_pk','player_name','primary_pitch'],
)
In [121]:
# Display the primary-pitch subset (notebook shows a truncated preview)
primary_fb_data
Out[121]:
delta_run_exp player_name p_throws batter stand pitch_type pitch_number home_team game_date game_pk ... ax ay az sz_top sz_bot inferred_axis axis_diff pitch_count game_week primary_pitch
0 0.038 Márquez, Germán R 605141 R FF 1 COL 2021-04-01 634615 ... -4.967683 25.527820 -18.815090 3.46 1.76 196.542983 21.623025 1 1 FF
1 -0.049 Márquez, Germán R 605141 R FF 2 COL 2021-04-01 634615 ... -4.405496 25.441128 -19.766641 3.29 1.49 195.859366 -0.859366 2 1 FF
2 0.052 Márquez, Germán R 605141 R FF 3 COL 2021-04-01 634615 ... -6.331036 25.767684 -20.177597 3.34 1.64 206.029592 1.970408 3 1 FF
3 0.113 Márquez, Germán R 605141 R FF 4 COL 2021-04-01 634615 ... -1.810341 24.856947 -23.067890 3.29 1.58 185.042451 33.123557 4 1 FF
4 -0.078 Márquez, Germán R 605141 R FF 5 COL 2021-04-01 634615 ... -3.312501 24.833759 -20.112617 3.22 1.55 191.181754 20.818246 5 1 FF
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
332042 -0.061 Gsellman, Robert R 594807 R SI 3 ATL 2021-10-03 632254 ... -18.151571 31.755423 -19.628348 3.41 1.56 236.901606 -15.901606 13 27 SI
332043 -0.038 Smith, Will L 641645 L FF 1 ATL 2021-10-03 632254 ... 10.379363 29.462506 -13.261773 3.31 1.55 153.279254 -7.279254 1 27 FF
332044 0.026 Smith, Will L 607043 L FF 1 ATL 2021-10-03 632254 ... 8.211265 31.770872 -13.621914 3.65 1.60 159.573975 -14.573975 4 27 FF
332045 -0.189 Smith, Will L 607043 L FF 2 ATL 2021-10-03 632254 ... 8.700586 30.117690 -15.941174 3.49 1.60 153.794165 -10.794165 5 27 FF
332046 -0.073 Smith, Will L 596019 R FF 4 ATL 2021-10-03 632254 ... 9.708393 26.562803 -14.083224 3.32 1.51 153.434949 -5.434949 9 27 FF

332047 rows × 40 columns

In [114]:
# Velocity and movement metrics compared against the outing's primary pitch
velo_mvt_cols = [
    'release_speed', 'release_spin_rate', 'pfx_x', 'pfx_z',
    'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az',
    'spin_axis', 'inferred_axis', 'axis_diff',
]

# Mean of each metric over the primary pitch, per pitcher per outing;
# columns become '<metric>_mean' and pitch_type is relabelled primary_pitch
primary_fb_means = (
    primary_fb_data
    .groupby(['game_date','game_pk','player_name','pitch_type'])[velo_mvt_cols]
    .mean()
    .add_suffix('_mean')
    .reset_index()
    .rename(columns={'pitch_type':'primary_pitch'})
)

# Attach the outing's primary-pitch means to every pitch of that outing
data21_merged = pd.merge(data21, primary_fb_means, on=['game_date','game_pk','player_name'], how='inner')
In [115]:
# Differential of every velocity / movement metric from the outing's primary-pitch mean.
# Two metrics get short column names; the rest are named '<metric>_diff'.
_diff_names = {'release_speed': 'velo_diff', 'release_spin_rate': 'spin_rate_diff'}
for _metric in velo_mvt_cols:
    _target = _diff_names.get(_metric, _metric + '_diff')
    data21_merged[_target] = data21_merged[_metric] - data21_merged[_metric + '_mean']

# Drop the helper columns: primary_pitch and all '<metric>_mean' columns
data21_merged = data21_merged.drop(
    columns=['primary_pitch'] + [_metric + '_mean' for _metric in velo_mvt_cols])
In [116]:
# Display the frame with the differential columns (notebook shows a truncated preview)
data21_merged
Out[116]:
delta_run_exp player_name p_throws batter stand pitch_type pitch_number home_team game_date game_pk ... pfx_z_diff vx0_diff vy0_diff vz0_diff ax_diff ay_diff az_diff spin_axis_diff inferred_axis_diff axis_diff_diff
0 0.038 Márquez, Germán R 605141 R FF 1 COL 2021-04-01 634615 ... 0.1714 0.042215 0.108148 3.347021 1.119505 0.065464 1.588882 10.479368 -7.466122 17.945490
1 -0.049 Márquez, Germán R 605141 R FF 2 COL 2021-04-01 634615 ... 0.0414 0.507318 -0.567653 -0.268293 1.681692 -0.021229 0.637331 -12.686640 -8.149739 -4.536901
2 0.052 Márquez, Germán R 605141 R FF 3 COL 2021-04-01 634615 ... 0.0214 -1.891575 -1.366921 1.789345 -0.243847 0.305327 0.226375 0.313360 2.020487 -1.707127
3 0.113 Márquez, Germán R 605141 R FF 4 COL 2021-04-01 634615 ... -0.1586 0.030702 -0.292753 3.321251 4.276848 -0.605410 -2.663917 10.479368 -18.966654 29.446022
4 -0.078 Márquez, Germán R 605141 R FF 5 COL 2021-04-01 634615 ... 0.0214 -0.282896 -1.062003 0.651255 2.774688 -0.628598 0.291355 4.313360 -12.827351 17.140711
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
704089 -0.189 Smith, Will L 607043 L FF 2 ATL 2021-10-03 632254 ... -0.1150 0.861282 1.087932 0.157783 -0.549316 0.639222 -1.714153 -2.500000 -1.226421 -1.273579
704090 0.016 Smith, Will L 596019 R CU 1 ATL 2021-10-03 632254 ... -2.0850 9.105589 24.382641 8.922196 -15.152558 -10.050905 -24.057726 184.500000 159.979414 24.520586
704091 -0.020 Smith, Will L 596019 R CU 2 ATL 2021-10-03 632254 ... -1.9050 4.752577 23.686689 5.488884 -14.122826 -8.876134 -22.035163 182.500000 153.097690 29.402310
704092 -0.027 Smith, Will L 596019 R SL 3 ATL 2021-10-03 632254 ... -0.9150 3.029527 16.157586 3.377736 -15.781025 -9.685077 -13.142093 169.500000 83.040965 86.459035
704093 -0.073 Smith, Will L 596019 R FF 4 ATL 2021-10-03 632254 ... -0.0150 -0.103284 -0.578015 -0.717265 0.458491 -2.915664 0.143797 2.500000 -1.585637 4.085637

704094 rows × 52 columns

Feature Exploration: "Noise" of Pitch Metrics¶

To be applied after clustering pitches for pitch classifications¶

In [96]:
# For "noise" around pitch trajectory, calculate a multivariate normal distribution
# for each unique pitch thrown for each pitcher over a season

# Note: Calculating multivariate distributions for each pitch per game is 
# both extremely computationally intensive, and each distribution unstable due to small samples
# of each pitch per game

# Note: Does not include axis_diff for this iteration

# Continuous pitch metrics that define each pitcher / pitch type distribution.
# NOTE(review): the note above says axis_diff is excluded in this iteration, yet it
# is listed here — confirm which is intended.
cont_feats = [
    'release_speed', 'release_extension', 'effective_speed', 'release_spin_rate',
    'release_pos_x', 'release_pos_y', 'release_pos_z',
    'pfx_x', 'pfx_z',
    'vx0', 'vy0', 'vz0',
    'ax', 'ay', 'az',
    'spin_axis', 'inferred_axis', 'axis_diff',
]

def multivariate_normal_distribution(x, features=None):
    '''
    Description: Applied to each group, fits a multivariate normal distribution to the
    group's continuous features using their sample mean and covariance matrix
    --------------------------------------------------------------------------------
    Inputs: x (dataframe holding the rows of one group),
        features (list of column names, optional; defaults to the module-level cont_feats)
    
    Returns: mvn_dist, SciPy (frozen) multivariate normal distribution
    '''
    
    # Extract the continuous variables
    cols = cont_feats if features is None else features
    continuous_vars = x[cols]
    
    # Sample mean and covariance, computed once. Covariance entries that are NA
    # (e.g. a single-row group) are set to 0, and a small ridge on the diagonal
    # keeps the matrix invertible for the later Mahalanobis step.
    mean = continuous_vars.mean().to_numpy()
    cov = continuous_vars.cov().fillna(0).to_numpy()
    cov_matrix = cov + np.eye(cov.shape[0]) * 1e-6
    
    # Create a multivariate normal distribution object
    mvn_dist = multivariate_normal(mean=mean, cov=cov_matrix, allow_singular=True)
    
    return mvn_dist

# One fitted distribution per pitcher and pitch type for the season, stored in
# column 'MV_Dist' alongside the player_name / pitch_type keys
pitch_noise_groups = (
    data21.groupby(['player_name','pitch_type'])
    .apply(multivariate_normal_distribution)
    .rename('MV_Dist')
    .reset_index()
)
In [97]:
def calc_mahalanobis(x):
    '''
    Description: Calculates the mahalanobis distance of one pitch's continuous
    features from the mean of its distribution, using the inverse of that
    distribution's covariance matrix
    --------------------------------------------------------------------------------
    Inputs: x (type Series), continuous feature values followed by the
            distribution object as the LAST entry
    
    Returns: mahalanobis_distance (type float)
    '''
    # The row is indexed by column labels, so take entries by position with .iloc;
    # plain x[-1] / x[:-1] depend on pandas' deprecated integer-as-position fallback
    distribution = x.iloc[-1]
    
    # The row holds an object alongside the numbers, so cast the features to float explicitly
    data = x.iloc[:-1].to_numpy(dtype=float)
    
    # Calculates distance
    mahalanobis_distance = mahalanobis(data, distribution.mean, np.linalg.inv(distribution.cov))
    return mahalanobis_distance
In [98]:
# Build the per-pitch distance table in one chain:
#   1. inner-join every pitch in data21 to the distribution fitted for its
#      (player_name, pitch_type) pair,
#   2. score each row with calc_mahalanobis (MV_Dist must be the last column passed),
#   3. drop the distribution objects once the distance has been stored.
data21_merged2 = (
    pd.merge(data21, pitch_noise_groups, how='inner', on=['player_name', 'pitch_type'])
    .assign(
        mahalanobis=lambda merged: merged[[*cont_feats, 'MV_Dist']].apply(
            calc_mahalanobis, axis=1)
    )
    .drop(columns='MV_Dist')
)
data21_merged2
Out[98]:
delta_run_exp player_name p_throws batter stand pitch_type pitch_number home_team game_date game_pk ... ax ay az sz_top sz_bot inferred_axis axis_diff pitch_count game_week mahalanobis
0 0.038 Márquez, Germán R 605141 R FF 1 COL 2021-04-01 634615 ... -4.967683 25.527820 -18.815090 3.46 1.76 196.542983 21.623025 1 1 3.752184
1 -0.049 Márquez, Germán R 605141 R FF 2 COL 2021-04-01 634615 ... -4.405496 25.441128 -19.766641 3.29 1.49 195.859366 -0.859366 2 1 10.473990
2 0.052 Márquez, Germán R 605141 R FF 3 COL 2021-04-01 634615 ... -6.331036 25.767684 -20.177597 3.34 1.64 206.029592 1.970408 3 1 2.909655
3 0.113 Márquez, Germán R 605141 R FF 4 COL 2021-04-01 634615 ... -1.810341 24.856947 -23.067890 3.29 1.58 185.042451 33.123557 4 1 4.861900
4 -0.078 Márquez, Germán R 605141 R FF 5 COL 2021-04-01 634615 ... -3.312501 24.833759 -20.112617 3.22 1.55 191.181754 20.818246 5 1 3.393314
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
704089 0.000 Domínguez, Seranthony R 571918 R SL 5 MIA 2021-10-03 632246 ... 4.254631 26.821952 -30.382886 3.37 1.53 103.781597 -0.781597 7 27 1.500000
704090 -0.114 Domínguez, Seranthony R 571918 R SL 6 MIA 2021-10-03 632246 ... 4.214959 25.765611 -31.283437 3.37 1.53 96.458816 -22.458816 8 27 1.500000
704091 0.024 Domínguez, Seranthony R 663743 R SL 2 MIA 2021-10-03 632246 ... 7.495288 26.630297 -32.724664 3.32 1.56 81.304497 22.695503 10 27 1.500000
704092 0.043 Domínguez, Seranthony R 663743 R SL 5 MIA 2021-10-03 632246 ... 5.347618 27.557519 -30.424209 3.28 1.56 96.766175 8.233825 13 27 1.500000
704093 0.038 Alexander, Tyler L 660162 R CU 1 CWS 2021-10-03 632252 ... -1.969161 19.192514 -34.745736 3.40 1.54 317.121096 -42.121096 68 27 0.000000

704094 rows × 40 columns

Examples¶

In [99]:
# Select one pitcher's rows from the merged 2021 frame
name = 'Wheeler, Zack'
wheeler_21 = data21_merged2.loc[data21_merged2['player_name'] == name]

# Bivariate histogram: distance on the x-axis, one band per pitch type
sns.histplot(data=wheeler_21, x="mahalanobis", y="pitch_type",
             hue="pitch_type", legend=False)

plt.title('Mahalanobis Distance by Pitch Type, Zack Wheeler 2021')
Out[99]:
Text(0.5, 1.0, 'Mahalanobis Distance by Pitch Type, Zack Wheeler 2021')
In [100]:
# Select one pitcher's rows from the merged 2021 frame
name = 'Kershaw, Clayton'
kershaw_21 = data21_merged2.loc[data21_merged2['player_name'] == name]

# Bivariate histogram: distance on the x-axis, one band per pitch type
sns.histplot(data=kershaw_21, x="mahalanobis", y="pitch_type",
             hue="pitch_type", legend=False)

plt.title('Mahalanobis Distance by Pitch Type, Clayton Kershaw 2021')
Out[100]:
Text(0.5, 1.0, 'Mahalanobis Distance by Pitch Type, Clayton Kershaw 2021')
In [101]:
# Select one pitcher's rows from the merged 2021 frame
name = 'Jansen, Kenley'
jansen_21 = data21_merged2.loc[data21_merged2['player_name'] == name]

# Bivariate histogram: distance on the x-axis, one band per pitch type
sns.histplot(data=jansen_21, x="mahalanobis", y="pitch_type",
             hue="pitch_type", legend=False)

plt.title('Mahalanobis Distance by Pitch Type, Kenley Jansen 2021')
Out[101]:
Text(0.5, 1.0, 'Mahalanobis Distance by Pitch Type, Kenley Jansen 2021')

Clustering¶

PCA Variance of Training Data¶

In [102]:
# Continuous features that are standardised before the decomposition
pca_scale_features = [
    'release_speed', 'release_extension', 'effective_speed',
    'release_spin_rate', 'release_pos_x', 'release_pos_y', 'release_pos_z',
    'spin_axis', 'axis_diff', 'inferred_axis', 'pfx_x', 'pfx_z',
    'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot',
]

# Categorical features are currently left out of the decomposition:
#pca_cat_features = ['p_throws','stand','pitch_number',
#    'home_team','game_week','balls','strikes','outs_when_up']
# (they would be added as ('categorical', OneHotEncoder(), pca_cat_features))

# Only the standard-scaled continuous columns are passed on to the decomposition
pca_preprocessor = ColumnTransformer(
    transformers=[('standard_scale', StandardScaler(), pca_scale_features)]
)

# The "PCA" step is a 10-component TruncatedSVD
pca_alldata = TruncatedSVD(n_components=10, random_state=15)

# Scale, then decompose
pca_pipeline_alldata = Pipeline(
    steps=[('preprocessor', pca_preprocessor), ('pca', pca_alldata)]
)

# Fit on both training seasons stacked together
all_data = pd.concat([data21, data22])
pca_pipeline_alldata.fit_transform(all_data)

# Share of variance explained by each individual component (not cumulative)
variance = pca_pipeline_alldata.named_steps['pca'].explained_variance_ratio_

# Bar chart of the per-component explained variance, components numbered from 1
variance_fig, variance_ax = plt.subplots(figsize=(10, 5))
variance_ax.bar(range(1, len(variance) + 1), variance)
variance_ax.set_title('Elbow Plot, PCA Training Data')
variance_ax.set_xlabel('Number of Components')
variance_ax.set_ylabel('Explained Variance')
variance_ax.grid(True)
plt.show()

Clustering Scheme¶

In [103]:
def create_scatterplot_subplot(input_data, x_vars, y_vars, hue_var, title = None, inp_ax = None):
    """
    Draw seaborn scatterplots of input_data with points coloured by hue_var.

    A single (x, y) pair is drawn as one plot, on inp_ax when it is given and
    otherwise on whichever axes seaborn selects. Any other number of pairs
    creates a new figure with one row of panels, one panel per pair.

    Parameters:
        input_data (DataFrame): The data to plot (already filtered by the caller).
        x_vars (list of str): Column name(s) for the x-axis.
        y_vars (list of str): Column name(s) for the y-axis, paired with x_vars by
            position; extra entries in the longer list are ignored.
        hue_var: Passed straight to seaborn's `hue` argument (a column name or a
            vector of labels).
        title (str, optional): Title for the single-plot case. Not used in the
            multi-panel case, where each panel is titled '<x> vs <y>'.
        inp_ax (matplotlib Axes, optional): Existing axes for the single-plot case.

    Returns: None
    """
    # plot single scatterplot or multiple depending on number of variables passed
    if len(x_vars) == 1:

        # seaborn returns the Axes it drew on (inp_ax itself when one was supplied)
        drawn_ax = sns.scatterplot(data=input_data, x=x_vars[0],
                                   y=y_vars[0], hue=hue_var, palette='Set1', ax=inp_ax)

        # Identity test instead of `!= None`; a title passed without an axes is
        # now applied as well instead of being silently dropped
        if inp_ax is not None or title is not None:
            drawn_ax.set_title(title, fontsize=12)
    else:
        num_cols = len(x_vars)
        num_rows = 1
        fig, axe = plt.subplots(num_rows, num_cols, figsize=(4*num_cols, 4))

        for i, (x_var, y_var) in enumerate(zip(x_vars, y_vars)):
            sns.scatterplot(data=input_data, x=x_var, y=y_var, hue=hue_var, palette='Set1', ax=axe[i])
            axe[i].set_title(f'{x_var} vs {y_var}')
            axe[i].set_xlabel(x_var)
            axe[i].set_ylabel(y_var)
In [104]:
def cluster_pitchdata(name, pitch_data, df_out):
    """
    Interactively re-cluster one pitcher's pitches and relabel them by pitch type.

    Steps: standardise the continuous features, reduce them to 4 components,
    plot the first two components, ask (via input()) how many clusters to fit,
    fit a Gaussian mixture on the components, name every cluster after the
    original pitch_type whose centroid is nearest, plot before/after, then ask
    (via input()) whether to keep the new labels or restore the original ones.

    Parameters:
        name (str): Name of pitcher, matched against pitch_data.player_name
        pitch_data (pandas DataFrame): Pitch data; must contain player_name,
            pitch_type and every column in standardscale_features below
        df_out (pandas DataFrame): Results accumulated so far (not modified)

    Returns: new DataFrame made of df_out followed by this pitcher's rows, with
        pitch_type holding the accepted labels and player_name set to name

    Raises: ValueError if the cluster-count prompt is answered with a non-integer
    """

    # Rows for this pitcher; the original pitch_type labels are set aside as the
    # reference ("ground-truth") classification and removed from the feature frame
    df = pitch_data[pitch_data.player_name == name].reset_index(drop=True)
    df_hue = df.pitch_type.reset_index(drop=True)
    df = df.drop(['player_name','pitch_type'],axis=1)

    # Define the column transformer for different scalers
    standardscale_features = ['release_speed','release_extension','effective_speed',
                          'release_spin_rate','release_pos_x', 'release_pos_y', 'release_pos_z',
                          'spin_axis','inferred_axis','axis_diff','pfx_x', 'pfx_z',
                          'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot']

    preprocessor = ColumnTransformer(
    transformers=[
        ('standard_scale', StandardScaler(), standardscale_features), # StandardScaler for continuous features
    ])

    # Fit PCA (as TruncatedSVD) to 4 components
    pca = TruncatedSVD(n_components=4, random_state = 15)

    # Create pipeline, fit
    pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', pca)
    ])

    df_pca = pd.DataFrame(pipeline.fit_transform(df),
                          columns = ['PC1','PC2','PC3','PC4'])

    # Plot PCA data to determine number of clusters to fit to data
    xpca_vars = ['PC1'] #['PC1','PC2','PC3','PC4']
    ypca_vars = ['PC2'] #['PC2','PC3','PC4','PC1']

    plt.figure(figsize=(4,4))
    create_scatterplot_subplot(df_pca, xpca_vars, ypca_vars, df_hue)
    plt.title(f'PCA: {name}')
    plt.show()


    print('Before Clustering:', ', '.join([f"{index}: {count}" for index, count in df_hue.value_counts().items()]))

    # Fit Gaussian Mixture Model (GMM) to the reduced data
    num_clusters = int(input('Specify Number of Clusters: '))  # Specify the number of clusters

    gmm = GaussianMixture(n_components=num_clusters, n_init = 30, random_state=42, max_iter = 500,
                      tol=1e-8,init_params = 'k-means++')
    cluster_labels = gmm.fit_predict(df_pca)

    # Add cluster labels back to the original data
    df['pitch_type'] = cluster_labels

    # Calculate cluster centroids
    cluster_centroids = gmm.means_

    # Calculate the centroids of ground truth labels
    ground_truth_centroids = []
    ground_truth_labels = np.unique(df_hue)

    for label in ground_truth_labels:
        centroid = np.mean(df_pca[np.array(df_hue) == label], axis=0)
        ground_truth_centroids.append(centroid)

    # For every cluster centroid, find the nearest ground-truth centroid by
    # mahalanobis distance. Several clusters may map to the same label, so the
    # relabelled data can contain fewer pitch types than clusters.
    cluster_to_ground_truth_mapping, _ = pairwise_distances_argmin_min(
    X= cluster_centroids, Y= ground_truth_centroids,
    metric='mahalanobis',
        metric_kwargs={'VI': np.linalg.inv(np.cov(cluster_centroids.T)+ np.identity(cluster_centroids.shape[1]))})

    # Index ground-truth labels to cluster labels
    # Cluster labels in order 0-x, cluster to ground-truth mapping in order of minimal mahalanobis distance to center
    ground_to_cluster_labels = [ground_truth_labels[i] for i in cluster_to_ground_truth_mapping]

    # Map cluster number -> ground-truth label
    mapping_dict = dict(enumerate(ground_to_cluster_labels))

    # Map dict of cluster and ground-truth labels to dataframe
    df['pitch_type'] = df['pitch_type'].map(mapping_dict)

    # Print number of each pitch type  after clustering
    print('After Clustering: ', ', '.join([f"{index}: {count}"
                                           for index, count in df['pitch_type'].value_counts().items()]))

    # Define new figure, plot PCA data before and after clustering analysis
    fig, axis = plt.subplots(1,2, figsize = (8, 4))
    create_scatterplot_subplot(df_pca, xpca_vars, ypca_vars, df_hue,
                               title = f'PCA: {name}',inp_ax = axis[0])
    create_scatterplot_subplot(df_pca, xpca_vars, ypca_vars, df['pitch_type'],
                           title = f'PCA (Clustered): {name}', inp_ax = axis[1])
    plt.show()

    # Manual review: only the exact answer 'y' keeps the clustered labels;
    # anything else (including an empty answer) restores the original labels
    cluster_eval = input('Is this clustering sufficient?: ')
    if cluster_eval != 'y':
        df['pitch_type'] = df_hue

    df['player_name'] = name

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call
    df_out = pd.concat([df_out, df])
    return df_out

Applied, Training Data¶

In [55]:
#data21_clus = pd.read_csv('Clustering2021.csv')
#data22_clus = pd.read_csv('Clustering2022.csv')
In [105]:
# Empty accumulator; the loop in the following cell reassigns new_clus to the
# result of each cluster_pitchdata(...) call
new_clus = pd.DataFrame()
In [106]:
# Position in data22.player_name.unique() at which this clustering run starts;
# earlier pitchers are skipped
start_index = 785

# Compute the unique pitcher list once instead of on every iteration, and let
# enumerate supply each pitcher's position instead of searching for it with np.where
pitcher_names = data22.player_name.unique()

for index, pitcher_name in enumerate(pitcher_names[start_index:], start=start_index):
    print(index, pitcher_name)
    new_clus = cluster_pitchdata(pitcher_name, data22, new_clus)
785 Ortiz, Luis L.
Before Clustering: FF: 118, SL: 99, SI: 41, CH: 14
Specify Number of Clusters: 4
After Clustering:  FF: 159, SL: 99, CH: 14
Is this clustering sufficient?: 
786 Espinal, Raynel
Before Clustering: FF: 29, CH: 29, SL: 17
Specify Number of Clusters: 3
After Clustering:  FF: 29, CH: 29, SL: 17
Is this clustering sufficient?: y
787 Dowdy, Kyle
Before Clustering: CU: 38, FF: 36, FC: 21
Specify Number of Clusters: 3
After Clustering:  CU: 38, FF: 36, FC: 21
Is this clustering sufficient?: y
788 Wiles, Collin
Before Clustering: FC: 63, SI: 27, FF: 24, CU: 19, SL: 15, CH: 5
Specify Number of Clusters: 6
After Clustering:  FC: 63, SI: 56, SL: 24, CU: 10
Is this clustering sufficient?: 
789 Jameson, Drey
Before Clustering: FF: 127, SI: 103, SL: 89, CH: 31, CU: 16
Specify Number of Clusters: 5
After Clustering:  FF: 125, SL: 111, SI: 93, CH: 37
Is this clustering sufficient?: 
790 Alzolay, Adbert
Before Clustering: FF: 54, SL: 46, FC: 22, SI: 18, CH: 10
Specify Number of Clusters: 6
After Clustering:  FF: 72, SL: 45, FC: 23, CH: 10
Is this clustering sufficient?: 
791 German, Frank
Before Clustering: FF: 58, SL: 24, CH: 2
Specify Number of Clusters: 3
After Clustering:  FF: 58, SL: 26
Is this clustering sufficient?: 
792 Hollowell, Gavin
Before Clustering: SL: 47, SI: 38, FF: 32, CH: 1
Specify Number of Clusters: 4
After Clustering:  SL: 48, SI: 37, FF: 33
Is this clustering sufficient?: 
793 Henriquez, Ronny
Before Clustering: SL: 82, CH: 53, FF: 41, SI: 3
Specify Number of Clusters: 4
After Clustering:  SL: 81, CH: 54, FF: 44
Is this clustering sufficient?: t
794 Miller, Shelby
Before Clustering: SL: 48, FF: 41
Specify Number of Clusters: 2
After Clustering:  SL: 48, FF: 41
Is this clustering sufficient?: y
795 Britton, Zack
Before Clustering: SI: 32, SL: 7
Specify Number of Clusters: 2
After Clustering:  SI: 32, SL: 7
Is this clustering sufficient?: y
796 Glasnow, Tyler
Before Clustering: FF: 24, SL: 14, CU: 10
Specify Number of Clusters: 3
After Clustering:  FF: 24, SL: 24
Is this clustering sufficient?: 
797 Díaz, Miguel
Before Clustering: CH: 9, FF: 4, SI: 3, SL: 2
Specify Number of Clusters: 4
After Clustering:  CH: 11, FF: 5, SL: 2
Is this clustering sufficient?: 
798 Woods Richardson, Simeon
Before Clustering: FF: 50, SL: 17, CU: 13, CH: 11
Specify Number of Clusters: 4
After Clustering:  FF: 50, SL: 25, CH: 12, CU: 4
Is this clustering sufficient?: 
799 McGee, Easton
Before Clustering: SL: 19, SI: 15, FC: 6, CH: 3, CU: 2, FF: 1
Specify Number of Clusters: 6
After Clustering:  SL: 18, SI: 17, CH: 7, CU: 3, FC: 1
Is this clustering sufficient?: 
In [ ]:
# Stack data22_clus and the pitchers clustered in this session (new_clus) into one frame.
# NOTE(review): data22_clus is only assigned in a commented-out pd.read_csv line earlier
# in this notebook - confirm it is loaded before running this cell
clus22 = pd.concat([data22_clus,new_clus])
In [ ]:
# Write the combined frame to Clustering2022.csv without the index column.
# NOTE(review): this is the same filename used by the commented-out read of
# data22_clus earlier in the notebook, so that file would be overwritten - confirm intended
clus22.to_csv('Clustering2022.csv', index=False)
In [ ]: